aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/NVPTX
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
-rw-r--r-- llvm/test/CodeGen/NVPTX/bug26185-2.ll | 9
-rw-r--r-- llvm/test/CodeGen/NVPTX/combine-wide.ll | 1339
-rw-r--r-- llvm/test/CodeGen/NVPTX/fold-movs.ll | 38
-rw-r--r-- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 168
-rw-r--r-- llvm/test/CodeGen/NVPTX/ld-param-sink.ll | 47
-rw-r--r-- llvm/test/CodeGen/NVPTX/local-stack-frame.ll | 7
-rw-r--r-- llvm/test/CodeGen/NVPTX/vector-loads.ll | 11
7 files changed, 1536 insertions, 83 deletions
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index 4e11f58..46172b1 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -16,7 +16,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
; CHECK: .maxntid 1, 1, 1
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0];
@@ -25,10 +25,9 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
-; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
-; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
-; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
-; CHECK-NEXT: st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT: ld.global.b64 %rd6, [%rd5];
+; CHECK-NEXT: mad.wide.s32 %rd7, %r1, %r1, %rd6;
+; CHECK-NEXT: st.global.b64 [%rd5], %rd7;
; CHECK-NEXT: ret;
bb:
%tmp5 = add nsw i64 %arg3, 8
diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll
new file mode 100644
index 0000000..ed4a2b6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll
@@ -0,0 +1,1339 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O1 | FileCheck %s --check-prefixes=CHECK,O1
+; RUN: llc < %s -O0 | FileCheck %s --check-prefixes=CHECK,O0
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i64 @t1(i32 %a, i32 %b, i64 %c) {
+; mul nsw i32 -> sext -> add i64 (accumulator first): -O1 expects one mad.wide.s32; -O0 keeps mul.lo/cvt/add.
+; O1-LABEL: t1(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t1_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t1_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t1_param_2];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t1(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t1_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t1_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t1_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 %c, %sext
+ ret i64 %add
+}
+
+define i64 @t2(i32 %a, i32 %b, i64 %c) {
+; Same as the nsw/sext/add case but with the sext operand first in the add: -O1 still expects mad.wide.s32.
+; O1-LABEL: t2(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t2_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t2_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t2_param_2];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t2(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t2_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t2_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t2_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd2, %rd1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 %sext, %c
+ ret i64 %add
+}
+
+define i64 @t3(i32 %a, i32 %b) {
+; mul nsw i32 -> sext -> add with constant addend 1: -O1 expects mad.wide.s32 with an immediate accumulator.
+; O1-LABEL: t3(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t3_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t3_param_1];
+; O1-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t3(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t3_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t3_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: add.s64 %rd2, %rd1, 1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd2;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ %add = add i64 1, %sext
+ ret i64 %add
+}
+
+define i64 @t4(i32 %a, i64 %c) {
+; mul nsw i32 by constant 3 -> sext -> add: -O1 expects mad.wide.s32 with an immediate multiplier.
+; O1-LABEL: t4(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t4_param_0];
+; O1-NEXT: ld.param.b64 %rd1, [t4_param_1];
+; O1-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t4(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t4_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t4_param_0];
+; O0-NEXT: mul.lo.s32 %r2, %r1, 3;
+; O0-NEXT: cvt.s64.s32 %rd2, %r2;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, 3
+ %sext = sext i32 %mul to i64
+ %add = add i64 %c, %sext
+ ret i64 %add
+}
+
+define i64 @t4_1(i32 %a, i64 %c) {
+; Constant multiplier 3 and constant addend 5 (%c is unused): -O1 expects mad.wide.s32 with two immediates.
+; O1-LABEL: t4_1(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0];
+; O1-NEXT: mad.wide.s32 %rd1, %r1, 3, 5;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t4_1(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t4_1_param_0];
+; O0-NEXT: mul.lo.s32 %r2, %r1, 3;
+; O0-NEXT: cvt.s64.s32 %rd1, %r2;
+; O0-NEXT: add.s64 %rd2, %rd1, 5;
+; O0-NEXT: st.param.b64 [func_retval0], %rd2;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, 3
+ %sext = sext i32 %mul to i64
+ %add = add i64 5, %sext
+ ret i64 %add
+}
+
+define i64 @t5(i32 %a, i32 %b, i64 %c) {
+; mul nuw i32 -> zext -> add i64 (accumulator first): -O1 expects one mad.wide.u32; -O0 keeps mul.lo/cvt/add.
+; O1-LABEL: t5(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t5_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t5_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t5_param_2];
+; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t5(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t5_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t5_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t5_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd1, %rd2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ %add = add i64 %c, %zext
+ ret i64 %add
+}
+
+define i64 @t6(i32 %a, i32 %b, i64 %c) {
+; Same as the nuw/zext/add case but with the zext operand first in the add: -O1 still expects mad.wide.u32.
+; O1-LABEL: t6(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t6_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t6_param_1];
+; O1-NEXT: ld.param.b64 %rd1, [t6_param_2];
+; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1;
+; O1-NEXT: st.param.b64 [func_retval0], %rd2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t6(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t6_param_2];
+; O0-NEXT: ld.param.b32 %r2, [t6_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t6_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd2, %r3;
+; O0-NEXT: add.s64 %rd3, %rd2, %rd1;
+; O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ %add = add i64 %zext, %c
+ ret i64 %add
+}
+
+define i32 @t7(i16 %a, i16 %b) {
+; Negative case, mul i16 without wrap flags then zext: both levels expect mul.lo.s16 + cvt.u32.u16, no mul.wide.
+; O1-LABEL: t7(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t7_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t7_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t7(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t7_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t7_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t8(i16 %a, i16 %b) {
+; Negative case, mul i16 without wrap flags then sext: both levels expect mul.lo.s16 + cvt.s32.s16, no mul.wide.
+; O1-LABEL: t8(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t8_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t8_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s32.s16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t8_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t8_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t9(i32 %a, i32 %b) {
+; Negative case, mul i32 without wrap flags then zext: both levels expect mul.lo.s32 + cvt.u64.u32, no mul.wide.
+; O1-LABEL: t9(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t9_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t9_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.u64.u32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t9(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t9_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t9_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t10(i32 %a, i32 %b) {
+; Negative case, mul i32 without wrap flags then sext: both levels expect mul.lo.s32 + cvt.s64.s32, no mul.wide.
+; O1-LABEL: t10(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t10_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t10_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t10(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t10_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t10_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t11(i16 %a, i16 %b) {
+; Negative case, mul nsw i16 paired with zext (flag and extension disagree): no mul.wide expected at either level.
+; O1-LABEL: t11(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t11_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t11_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t11(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t11_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t11_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t12(i16 %a, i16 %b) {
+; mul nsw i16 -> sext i32: -O1 expects a single mul.wide.s16; -O0 keeps mul.lo.s16 + cvt.s32.s16.
+; O1-LABEL: t12(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t12_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t12_param_1];
+; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t12(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t12_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t12_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t13(i32 %a, i32 %b) {
+; Negative case, mul nsw i32 paired with zext (flag and extension disagree): no mul.wide expected at either level.
+; O1-LABEL: t13(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t13_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t13_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.u64.u32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t13(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t13_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t13_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t14(i32 %a, i32 %b) {
+; mul nsw i32 -> sext i64: -O1 expects a single mul.wide.s32; -O0 keeps mul.lo.s32 + cvt.s64.s32.
+; O1-LABEL: t14(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t14_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t14_param_1];
+; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t14(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t14_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t14_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t15(i16 %a, i16 %b) {
+; mul nuw i16 -> zext i32: -O1 expects a single mul.wide.u16; -O0 keeps mul.lo.s16 + cvt.u32.u16.
+; O1-LABEL: t15(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t15_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t15_param_1];
+; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t15(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t15_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t15_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t16(i16 %a, i16 %b) {
+; Negative case, mul nuw i16 paired with sext (flag and extension disagree): no mul.wide expected at either level.
+; O1-LABEL: t16(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t16_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t16_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s32.s16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t16(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t16_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t16_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t17(i32 %a, i32 %b) {
+; mul nuw i32 -> zext i64: -O1 expects a single mul.wide.u32; -O0 keeps mul.lo.s32 + cvt.u64.u32.
+; O1-LABEL: t17(
+; O1: {
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t17_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t17_param_1];
+; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t17(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t17_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t17_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.u64.u32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t18(i32 %a, i32 %b) {
+; Negative case, mul nuw i32 paired with sext (flag and extension disagree): no mul.wide expected at either level.
+; O1-LABEL: t18(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t18_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t18_param_1];
+; O1-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t18(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t18_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t18_param_0];
+; O0-NEXT: mul.lo.s32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t19(i16 %a, i16 %b) {
+; Flagless mul i16 then zext, not widened. NOTE(review): IR is identical to @t7 - confirm whether another case was intended.
+; O1-LABEL: t19(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t19_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t19_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t19(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t19_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t19_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t20(i16 %a) {
+; Negative case, shl i16 by 4 without wrap flags then sext: shl.b16 + cvt.s32.s16 expected at both levels.
+; CHECK-LABEL: t20(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t20_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.s32.s16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t21(i32 %a) {
+; Negative case, shl i32 by 4 without wrap flags then zext: shl.b32 + cvt.u64.u32 expected at both levels.
+; CHECK-LABEL: t21(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t21_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t22(i32 %a) {
+; Negative case, shl i32 by 4 without wrap flags then sext: shl.b32 + cvt.s64.s32 expected at both levels.
+; CHECK-LABEL: t22(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t22_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.s64.s32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t23(i16 %a, i16 %b) {
+; Negative case, shl nsw i16 by 4 paired with zext (%b is unused): shl.b16 + cvt.u32.u16 expected at both levels.
+; CHECK-LABEL: t23(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t23_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl nsw i16 %a, 4
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t24(i16 %a, i16 %b) {
+; shl nsw i16 by 4 -> sext (%b is unused): -O1 expects mul.wide.s16 by 16; -O0 keeps shl.b16 + cvt.s32.s16.
+; O1-LABEL: t24(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t24_param_0];
+; O1-NEXT: mul.wide.s16 %r1, %rs1, 16;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t24(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t24_param_0];
+; O0-NEXT: shl.b16 %rs2, %rs1, 4;
+; O0-NEXT: cvt.s32.s16 %r1, %rs2;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = shl nsw i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t25(i32 %a) {
+; Negative case, shl nsw i32 by 4 paired with zext: shl.b32 + cvt.u64.u32 expected at both levels.
+; CHECK-LABEL: t25(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t25_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl nsw i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t26(i32 %a) {
+; shl nsw i32 by 4 -> sext: -O1 expects mul.wide.s32 by 16; -O0 keeps shl.b32 + cvt.s64.s32.
+; O1-LABEL: t26(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t26_param_0];
+; O1-NEXT: mul.wide.s32 %rd1, %r1, 16;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t26(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t26_param_0];
+; O0-NEXT: shl.b32 %r2, %r1, 4;
+; O0-NEXT: cvt.s64.s32 %rd1, %r2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nsw i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t27(i16 %a, i16 %b) {
+; shl nuw i16 by 4 -> zext (%b is unused): -O1 expects mul.wide.u16 by 16; -O0 keeps shl.b16 + cvt.u32.u16.
+; O1-LABEL: t27(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t27_param_0];
+; O1-NEXT: mul.wide.u16 %r1, %rs1, 16;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t27(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t27_param_0];
+; O0-NEXT: shl.b16 %rs2, %rs1, 4;
+; O0-NEXT: cvt.u32.u16 %r1, %rs2;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = shl nuw i16 %a, 4
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t28(i16 %a, i16 %b) {
+; Negative case, shl nuw i16 by 4 paired with sext (%b is unused): shl.b16 + cvt.s32.s16 expected at both levels.
+; CHECK-LABEL: t28(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [t28_param_0];
+; CHECK-NEXT: shl.b16 %rs2, %rs1, 4;
+; CHECK-NEXT: cvt.s32.s16 %r1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %mul = shl nuw i16 %a, 4
+ %sext = sext i16 %mul to i32
+ ret i32 %sext
+}
+
+define i64 @t29(i32 %a) {
+; shl nuw i32 by 4 -> zext: -O1 expects mul.wide.u32 by 16; -O0 keeps shl.b32 + cvt.u64.u32.
+; O1-LABEL: t29(
+; O1: {
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t29_param_0];
+; O1-NEXT: mul.wide.u32 %rd1, %r1, 16;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t29(
+; O0: {
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t29_param_0];
+; O0-NEXT: shl.b32 %r2, %r1, 4;
+; O0-NEXT: cvt.u64.u32 %rd1, %r2;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nuw i32 %a, 4
+ %zext = zext i32 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t30(i32 %a) {
+; Negative case, shl nuw i32 by 4 paired with sext: shl.b32 + cvt.s64.s32 expected at both levels.
+; CHECK-LABEL: t30(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [t30_param_0];
+; CHECK-NEXT: shl.b32 %r2, %r1, 4;
+; CHECK-NEXT: cvt.s64.s32 %rd1, %r2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %mul = shl nuw i32 %a, 4
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i64 @t31(i32 %a, i32 %b) {
+; Negative case, shl nuw i32 by a variable amount %b then sext: shl.b32 + cvt.s64.s32 expected at both levels.
+; O1-LABEL: t31(
+; O1: {
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b32 %r1, [t31_param_0];
+; O1-NEXT: ld.param.b32 %r2, [t31_param_1];
+; O1-NEXT: shl.b32 %r3, %r1, %r2;
+; O1-NEXT: cvt.s64.s32 %rd1, %r3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t31(
+; O0: {
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r2, [t31_param_1];
+; O0-NEXT: ld.param.b32 %r1, [t31_param_0];
+; O0-NEXT: shl.b32 %r3, %r1, %r2;
+; O0-NEXT: cvt.s64.s32 %rd1, %r3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = shl nuw i32 %a, %b
+ %sext = sext i32 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t32(i16 %a, i16 %b, i32 %c) {
+; mul nsw i16 -> sext -> add i32 (accumulator first): -O1 expects one mad.wide.s16; -O0 keeps mul.lo/cvt/add.
+; O1-LABEL: t32(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t32_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t32_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t32_param_2];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t32(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t32_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t32_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t32_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 %c, %sext
+ ret i32 %add
+}
+
+define i32 @t33(i16 %a, i16 %b, i32 %c) {
+; Commuted form of the nsw i16 mad case (sext operand first in the add): -O1 still expects mad.wide.s16.
+; O1-LABEL: t33(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t33_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t33_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t33_param_2];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t33(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t33_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t33_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t33_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r2, %r1;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 %sext, %c
+ ret i32 %add
+}
+
+define i32 @t34(i16 %a, i16 %b) {
+; mul nsw i16 -> sext -> add with constant addend 1: -O1 expects mad.wide.s16 with an immediate accumulator.
+; O1-LABEL: t34(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t34_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t34_param_1];
+; O1-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t34(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t34_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t34_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s32.s16 %r1, %rs3;
+; O0-NEXT: add.s32 %r2, %r1, 1;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i32
+ %add = add i32 1, %sext
+ ret i32 %add
+}
+
+define i32 @t35(i16 %a, i32 %c) {
+; mul nsw i16 by constant 3 -> sext -> add: -O1 expects mad.wide.s16 with an immediate multiplier.
+; O1-LABEL: t35(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t35_param_0];
+; O1-NEXT: ld.param.b32 %r1, [t35_param_1];
+; O1-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t35(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t35_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t35_param_0];
+; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3;
+; O0-NEXT: cvt.s32.s16 %r2, %rs2;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, 3
+ %sext = sext i16 %mul to i32
+ %add = add i32 %c, %sext
+ ret i32 %add
+}
+
+define i32 @t36(i16 %a, i32 %c) {
+; Constant multiplier 3 and constant addend 5 (%c is unused): -O1 expects mad.wide.s16 with two immediates.
+; O1-LABEL: t36(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<2>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t36_param_0];
+; O1-NEXT: mad.wide.s16 %r1, %rs1, 3, 5;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t36(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<3>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [t36_param_0];
+; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3;
+; O0-NEXT: cvt.s32.s16 %r1, %rs2;
+; O0-NEXT: add.s32 %r2, %r1, 5;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, 3
+ %sext = sext i16 %mul to i32
+ %add = add i32 5, %sext
+ ret i32 %add
+}
+
+define i32 @t37(i16 %a, i16 %b, i32 %c) {
+; mul nuw i16 -> zext -> add i32 (accumulator first): -O1 expects one mad.wide.u16; -O0 keeps mul.lo/cvt/add.
+; O1-LABEL: t37(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t37_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t37_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t37_param_2];
+; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t37(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t37_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t37_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t37_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r1, %r2;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ %add = add i32 %c, %zext
+ ret i32 %add
+}
+
+define i32 @t38(i16 %a, i16 %b, i32 %c) {
+; Same as the nuw/zext/add i16 case but with the zext operand first in the add: -O1 still expects mad.wide.u16.
+; O1-LABEL: t38(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<3>;
+; O1-NEXT: .reg .b32 %r<3>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t38_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t38_param_1];
+; O1-NEXT: ld.param.b32 %r1, [t38_param_2];
+; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r2;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t38(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b32 %r1, [t38_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t38_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t38_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r2, %r1;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i32
+ %add = add i32 %zext, %c
+ ret i32 %add
+}
+
+define i64 @t39(i16 %a, i16 %b) {
+; O1-LABEL: t39(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t39_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t39_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+; Negative case, flagless mul i16 zero-extended straight to i64: mul.lo.s16 + cvt.u64.u16 expected at both levels.
+; O0-LABEL: t39(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t39_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t39_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul i16 %a, %b
+ %zext = zext i16 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t40(i16 %a, i16 %b) {
+; O1-LABEL: t40(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t40_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t40_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t40(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t40_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t40_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.u64.u16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ %zext = zext i16 %mul to i64
+ ret i64 %zext
+}
+
+define i64 @t41(i16 %a, i16 %b) {
+; O1-LABEL: t41(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t41_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t41_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: cvt.s64.s16 %rd1, %rs3;
+; O1-NEXT: st.param.b64 [func_retval0], %rd1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t41(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs2, [t41_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t41_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: cvt.s64.s16 %rd1, %rs3;
+; O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; O0-NEXT: ret;
+ %mul = mul nsw i16 %a, %b
+ %sext = sext i16 %mul to i64
+ ret i64 %sext
+}
+
+define i32 @t42(i16 %a, i16 %b, ptr %ptr) {
+; O1-LABEL: t42(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<2>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t42_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t42_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: ld.param.b64 %rd1, [t42_param_2];
+; O1-NEXT: st.b16 [%rd1], %rs3;
+; O1-NEXT: cvt.u32.u16 %r1, %rs3;
+; O1-NEXT: st.param.b32 [func_retval0], %r1;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t42(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t42_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t42_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t42_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: st.b16 [%rd1], %rs3;
+; O0-NEXT: cvt.u32.u16 %r1, %rs3;
+; O0-NEXT: st.param.b32 [func_retval0], %r1;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ store i16 %mul, ptr %ptr
+ %zext = zext i16 %mul to i32
+ ret i32 %zext
+}
+
+define i32 @t43(i16 %a, i16 %b, i32 %c, ptr %ptr) {
+; O1-LABEL: t43(
+; O1: {
+; O1-NEXT: .reg .b16 %rs<4>;
+; O1-NEXT: .reg .b32 %r<4>;
+; O1-NEXT: .reg .b64 %rd<2>;
+; O1-EMPTY:
+; O1-NEXT: // %bb.0:
+; O1-NEXT: ld.param.b16 %rs1, [t43_param_0];
+; O1-NEXT: ld.param.b16 %rs2, [t43_param_1];
+; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O1-NEXT: ld.param.b64 %rd1, [t43_param_3];
+; O1-NEXT: st.b16 [%rd1], %rs3;
+; O1-NEXT: ld.param.b32 %r1, [t43_param_2];
+; O1-NEXT: cvt.u32.u16 %r2, %rs3;
+; O1-NEXT: add.s32 %r3, %r2, %r1;
+; O1-NEXT: st.param.b32 [func_retval0], %r3;
+; O1-NEXT: ret;
+;
+; O0-LABEL: t43(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<4>;
+; O0-NEXT: .reg .b32 %r<4>;
+; O0-NEXT: .reg .b64 %rd<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b64 %rd1, [t43_param_3];
+; O0-NEXT: ld.param.b32 %r1, [t43_param_2];
+; O0-NEXT: ld.param.b16 %rs2, [t43_param_1];
+; O0-NEXT: ld.param.b16 %rs1, [t43_param_0];
+; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; O0-NEXT: st.b16 [%rd1], %rs3;
+; O0-NEXT: cvt.u32.u16 %r2, %rs3;
+; O0-NEXT: add.s32 %r3, %r2, %r1;
+; O0-NEXT: st.param.b32 [func_retval0], %r3;
+; O0-NEXT: ret;
+ %mul = mul nuw i16 %a, %b
+ store i16 %mul, ptr %ptr
+ %zext = zext i16 %mul to i32
+ %add = add i32 %zext, %c
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll
new file mode 100644
index 0000000..6ee0fb2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefixes=CHECK-F32X2
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+target triple = "nvptx64-nvidia-cuda"
+
+; Since fdiv doesn't support f32x2, this will create BUILD_VECTORs that will be
+; folded into the store, turning it into st.global.v8.b32.
+define void @writevec(<8 x float> %v1, <8 x float> %v2, ptr addrspace(1) %p) {
+; CHECK-F32X2-LABEL: writevec(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<25>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [writevec_param_0];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [writevec_param_0+16];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [writevec_param_1+16];
+; CHECK-F32X2-NEXT: div.rn.f32 %r13, %r8, %r12;
+; CHECK-F32X2-NEXT: div.rn.f32 %r14, %r7, %r11;
+; CHECK-F32X2-NEXT: div.rn.f32 %r15, %r6, %r10;
+; CHECK-F32X2-NEXT: div.rn.f32 %r16, %r5, %r9;
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [writevec_param_1];
+; CHECK-F32X2-NEXT: div.rn.f32 %r21, %r4, %r20;
+; CHECK-F32X2-NEXT: div.rn.f32 %r22, %r3, %r19;
+; CHECK-F32X2-NEXT: div.rn.f32 %r23, %r2, %r18;
+; CHECK-F32X2-NEXT: div.rn.f32 %r24, %r1, %r17;
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [writevec_param_2];
+; CHECK-F32X2-NEXT: st.global.v8.b32 [%rd1], {%r24, %r23, %r22, %r21, %r16, %r15, %r14, %r13};
+; CHECK-F32X2-NEXT: ret;
+ %v = fdiv <8 x float> %v1, %v2
+ store <8 x float> %v, ptr addrspace(1) %p, align 32
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 06c2cc8..26336b8 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-LABEL: test_smax(
; O0: {
; O0-NEXT: .reg .pred %p<5>;
-; O0-NEXT: .reg .b32 %r<18>;
+; O0-NEXT: .reg .b32 %r<26>;
; O0-EMPTY:
; O0-NEXT: // %bb.0:
; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1];
; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0];
-; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
-; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
+; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
+; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O0-NEXT: setp.gt.s32 %p1, %r4, %r3;
-; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
+; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O0-NEXT: setp.gt.s32 %p2, %r6, %r5;
-; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
-; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
+; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
+; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O0-NEXT: setp.gt.s32 %p3, %r8, %r7;
-; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
-; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
+; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
+; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O0-NEXT: setp.gt.s32 %p4, %r10, %r9;
-; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
-; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
-; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
-; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
-; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
-; O0-NEXT: st.param.b32 [func_retval0], %r17;
+; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
+; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
+; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
+; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
+; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
+; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
+; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
+; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
+; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
+; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
+; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
+; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
+; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
+; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
+; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
+; O0-NEXT: st.param.b32 [func_retval0], %r25;
; O0-NEXT: ret;
;
; O3-LABEL: test_smax(
; O3: {
; O3-NEXT: .reg .pred %p<5>;
-; O3-NEXT: .reg .b32 %r<18>;
+; O3-NEXT: .reg .b32 %r<26>;
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0];
; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1];
-; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
-; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
+; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
+; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O3-NEXT: setp.gt.s32 %p1, %r4, %r3;
-; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
+; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O3-NEXT: setp.gt.s32 %p2, %r6, %r5;
-; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
-; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
+; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
+; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O3-NEXT: setp.gt.s32 %p3, %r8, %r7;
-; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
-; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
+; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
+; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O3-NEXT: setp.gt.s32 %p4, %r10, %r9;
-; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
-; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
-; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
-; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
-; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
-; O3-NEXT: st.param.b32 [func_retval0], %r17;
+; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
+; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
+; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
+; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
+; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
+; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
+; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
+; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
+; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
+; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
+; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
+; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
+; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
+; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
+; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
+; O3-NEXT: st.param.b32 [func_retval0], %r25;
; O3-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-LABEL: test_smin(
; O0: {
; O0-NEXT: .reg .pred %p<5>;
-; O0-NEXT: .reg .b32 %r<18>;
+; O0-NEXT: .reg .b32 %r<26>;
; O0-EMPTY:
; O0-NEXT: // %bb.0:
; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1];
; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0];
-; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
-; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
+; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
+; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O0-NEXT: setp.le.s32 %p1, %r4, %r3;
-; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
+; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O0-NEXT: setp.le.s32 %p2, %r6, %r5;
-; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
-; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
+; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
+; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O0-NEXT: setp.le.s32 %p3, %r8, %r7;
-; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
-; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
+; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
+; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O0-NEXT: setp.le.s32 %p4, %r10, %r9;
-; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4;
-; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
-; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1;
-; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
-; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
-; O0-NEXT: st.param.b32 [func_retval0], %r17;
+; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
+; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
+; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
+; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
+; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
+; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4;
+; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
+; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3;
+; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
+; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
+; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2;
+; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
+; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1;
+; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
+; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
+; O0-NEXT: st.param.b32 [func_retval0], %r25;
; O0-NEXT: ret;
;
; O3-LABEL: test_smin(
; O3: {
; O3-NEXT: .reg .pred %p<5>;
-; O3-NEXT: .reg .b32 %r<18>;
+; O3-NEXT: .reg .b32 %r<26>;
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0];
; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1];
-; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
-; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
+; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U;
+; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U;
; O3-NEXT: setp.le.s32 %p1, %r4, %r3;
-; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
-; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U;
+; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
; O3-NEXT: setp.le.s32 %p2, %r6, %r5;
-; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
-; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
+; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
+; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U;
; O3-NEXT: setp.le.s32 %p3, %r8, %r7;
-; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
-; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
+; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U;
+; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U;
; O3-NEXT: setp.le.s32 %p4, %r10, %r9;
-; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4;
-; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
-; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1;
-; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
-; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
-; O3-NEXT: st.param.b32 [func_retval0], %r17;
+; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U;
+; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U;
+; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U;
+; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U;
+; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U;
+; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4;
+; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
+; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3;
+; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
+; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U;
+; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2;
+; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
+; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1;
+; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
+; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
+; O3-NEXT: st.param.b32 [func_retval0], %r25;
; O3-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
diff --git a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll
new file mode 100644
index 0000000..03523a3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare ptr @bar(i64)
+declare i64 @baz()
+
+define ptr @foo(i1 %cond) {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b8 %rs1, [foo_param_0];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: call.uni (retval0), baz, ();
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %bb
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-NEXT: call.uni (retval0), bar, (param0);
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: $L__BB0_2: // %common.ret
+; CHECK-NEXT: st.param.b64 [func_retval0], 0;
+; CHECK-NEXT: ret;
+entry:
+ %call = call i64 @baz()
+ br i1 %cond, label %common.ret, label %bb
+
+bb:
+ %tmp = call ptr @bar(i64 %call)
+ br label %common.ret
+
+common.ret:
+ ret ptr null
+}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 5c30173..ae069cf 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -114,15 +114,14 @@ define void @foo3(i32 %a) {
; PTX64-NEXT: .reg .b64 %SP;
; PTX64-NEXT: .reg .b64 %SPL;
; PTX64-NEXT: .reg .b32 %r<2>;
-; PTX64-NEXT: .reg .b64 %rd<5>;
+; PTX64-NEXT: .reg .b64 %rd<4>;
; PTX64-EMPTY:
; PTX64-NEXT: // %bb.0:
; PTX64-NEXT: mov.b64 %SPL, __local_depot2;
; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0];
; PTX64-NEXT: add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT: mul.wide.s32 %rd3, %r1, 4;
-; PTX64-NEXT: add.s64 %rd4, %rd2, %rd3;
-; PTX64-NEXT: st.local.b32 [%rd4], %r1;
+; PTX64-NEXT: mad.wide.s32 %rd3, %r1, 4, %rd2;
+; PTX64-NEXT: st.local.b32 [%rd3], %r1;
; PTX64-NEXT: ret;
%local = alloca [3 x i32], align 4
%1 = getelementptr inbounds i32, ptr %local, i32 %a
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index e16fc74..6f0dff7 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -154,7 +154,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-NEXT: .reg .b32 %r<8>;
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [foo_complex_param_0];
@@ -166,12 +166,11 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
; CHECK-NEXT: shl.b32 %r6, %r1, 1;
; CHECK-NEXT: or.b32 %r7, %r5, %r6;
; CHECK-NEXT: cvt.u64.u32 %rd2, %r7;
-; CHECK-NEXT: mul.wide.u32 %rd3, %r3, 131072;
-; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: add.s64 %rd5, %rd4, %rd2;
-; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd5+128];
+; CHECK-NEXT: mad.wide.u32 %rd3, %r3, 131072, %rd1;
+; CHECK-NEXT: add.s64 %rd4, %rd3, %rd2;
+; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd4+128];
; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.b8 [%rd5+129], %rs3;
+; CHECK-NEXT: st.b8 [%rd4+129], %rs3;
; CHECK-NEXT: ret;
%t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
%t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()