; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Codegen tests for extractelement on small i8 vectors (<2 x i8>, <4 x i8>,
; <8 x i8>) when targeting nvptx64 / sm_35. Each function extracts every
; element of the vector, widens it (sign- or zero-extension) or compares it,
; and combines the results so that all extracts stay live.
;
; The first invocation above pipes llc output through FileCheck; the second
; one additionally feeds the PTX to ptxas, and is guarded so that it only runs
; when ptxas is available.
;
; The assertion lines inside each function are autogenerated (see the NOTE on
; the first line). Do not edit them by hand; re-run
; utils/update_llc_test_checks.py after changing the IR.

target triple = "nvptx64-nvidia-cuda"

; Bitcasts an i16 argument to <2 x i8>, extracts both elements, sign-extends
; each to i16 and returns their sum.
; The expected PTX works directly on the 16-bit register (cvt.s16.s8 and
; shr.s16 by 8) and contains no vector or byte-permute instructions.
define i16 @test_v2i8(i16 %a) {
; CHECK-LABEL: test_v2i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_v2i8_param_0];
; CHECK-NEXT: cvt.s16.s8 %rs2, %rs1;
; CHECK-NEXT: shr.s16 %rs3, %rs1, 8;
; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  %v = bitcast i16 %a to <2 x i8>
  %r0 = extractelement <2 x i8> %v, i64 0
  %r1 = extractelement <2 x i8> %v, i64 1
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r01 = add i16 %r0i, %r1i
  ret i16 %r01
}

; Loads a <2 x i8> from %a (align 4), compares each element against zero and
; returns true only when both comparisons hold (the select with a constant
; false arm acts as a logical AND of the two i1 values).
; The expected PTX performs a single ld.v2.b8, ORs the two loaded values,
; masks the result with 255 and does one setp.eq against 0.
define i1 @test_v2i8_load(ptr %a) {
; CHECK-LABEL: test_v2i8_load(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0];
; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2;
; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
; CHECK-NEXT: setp.eq.b16 %p1, %rs6, 0;
; CHECK-NEXT: selp.b32 %r1, -1, 0, %p1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  %v = load <2 x i8>, ptr %a, align 4
  %r0 = extractelement <2 x i8> %v, i64 0
  %r1 = extractelement <2 x i8> %v, i64 1
  %icmp = icmp eq i8 %r0, 0
  %icmp3 = icmp eq i8 %r1, 0
  %select = select i1 %icmp, i1 %icmp3, i1 false
  ret i1 %select
}

; Bitcasts an i32 argument to <4 x i8>, extracts all four elements,
; sign-extends each to i16 and returns the sum (added pairwise).
; The expected PTX uses cvt.s8.s32 plus three prmt.b32 instructions with
; selectors 0x9991U, 0xaaa2U and 0xbbb3U, each followed by a cvt.u16.u32.
; NOTE(review): these selectors presumably pick bytes 1..3 with sign
; replication into the upper bytes - confirm against the PTX prmt definition.
define i16 @test_v4i8(i32 %a) {
; CHECK-LABEL: test_v4i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<8>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0];
; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x9991U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xaaa2U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r4;
; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r2i = sext i8 %r2 to i16
  %r3i = sext i8 %r3 to i16
  %r01 = add i16 %r0i, %r1i
  %r23 = add i16 %r2i, %r3i
  %r = add i16 %r01, %r23
  ret i16 %r
}

; Same shape as test_v4i8, but the extracted elements are sign-extended to
; i32 and the function returns i32.
; The expected PTX stays entirely in 32-bit registers: cvt.s32.s8 plus three
; prmt.b32 instructions (selectors 0x9991U, 0xaaa2U, 0xbbb3U), with no
; 16-bit conversions.
define i32 @test_v4i8_s32(i32 %a) {
; CHECK-LABEL: test_v4i8_s32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0];
; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  %r0i = sext i8 %r0 to i32
  %r1i = sext i8 %r1 to i32
  %r2i = sext i8 %r2 to i32
  %r3i = sext i8 %r3 to i32
  %r01 = add i32 %r0i, %r1i
  %r23 = add i32 %r2i, %r3i
  %r = add i32 %r01, %r23
  ret i32 %r
}

; Zero-extending variant of test_v4i8_s32: each extracted i8 is zext'ed to
; i32 before the pairwise sum.
; The expected PTX uses an and.b32 with 255 plus three prmt.b32 instructions
; with selectors 0x7771U, 0x7772U and 0x7773U.
; NOTE(review): selector nibble 7 presumably picks a byte of the zero second
; operand, yielding a zero-extended byte - confirm against the PTX prmt
; definition.
define i32 @test_v4i8_u32(i32 %a) {
; CHECK-LABEL: test_v4i8_u32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0];
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
; CHECK-NEXT: and.b32 %r5, %r1, 255;
; CHECK-NEXT: add.s32 %r6, %r5, %r2;
; CHECK-NEXT: add.s32 %r7, %r3, %r4;
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  %r0i = zext i8 %r0 to i32
  %r1i = zext i8 %r1 to i32
  %r2i = zext i8 %r2 to i32
  %r3i = zext i8 %r3 to i32
  %r01 = add i32 %r0i, %r1i
  %r23 = add i32 %r2i, %r3i
  %r = add i32 %r01, %r23
  ret i32 %r
}

; Bitcasts an i64 argument to <8 x i8>, extracts all eight elements,
; sign-extends each to i16 and returns the sum (reduced as a balanced tree of
; adds).
; The expected PTX loads the parameter as two 32-bit halves
; (ld.param.v2.b32) and then repeats the test_v4i8 extraction sequence
; (cvt.s8.s32 plus three prmt.b32) once per half.
define i16 @test_v8i8(i64 %a) {
; CHECK-LABEL: test_v8i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
; CHECK-NEXT: cvt.s8.s32 %rs5, %r2;
; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x9991U;
; CHECK-NEXT: cvt.u16.u32 %rs6, %r6;
; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r7;
; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0xbbb3U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r8;
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
; CHECK-NEXT: add.s16 %rs12, %rs7, %rs8;
; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs15;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
  %v = bitcast i64 %a to <8 x i8>
  %r0 = extractelement <8 x i8> %v, i64 0
  %r1 = extractelement <8 x i8> %v, i64 1
  %r2 = extractelement <8 x i8> %v, i64 2
  %r3 = extractelement <8 x i8> %v, i64 3
  %r4 = extractelement <8 x i8> %v, i64 4
  %r5 = extractelement <8 x i8> %v, i64 5
  %r6 = extractelement <8 x i8> %v, i64 6
  %r7 = extractelement <8 x i8> %v, i64 7
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r2i = sext i8 %r2 to i16
  %r3i = sext i8 %r3 to i16
  %r4i = sext i8 %r4 to i16
  %r5i = sext i8 %r5 to i16
  %r6i = sext i8 %r6 to i16
  %r7i = sext i8 %r7 to i16
  %r01 = add i16 %r0i, %r1i
  %r23 = add i16 %r2i, %r3i
  %r45 = add i16 %r4i, %r5i
  %r67 = add i16 %r6i, %r7i
  %r0123 = add i16 %r01, %r23
  %r4567 = add i16 %r45, %r67
  %r = add i16 %r0123, %r4567
  ret i16 %r
}